# import python packages which we need for this project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # visualizing data
%matplotlib inline
import seaborn as sns
మన భారతదేశంలో దీపావళి అమ్మకాలు ఎలా ఉండాలని, దీపావళి సేల్స్ను ప్రభావితం చేసే కొన్ని అంశాలు పరిగిణించుకుంటూ అన్వేషణాత్మక డేటా విశ్లేషణ
# import csv file for this
= pd.read_csv('Diwali Sales Data.csv', encoding= 'unicode_escape') df
df.shape
(11251, 15)
df.head()
User_ID | Cust_name | Product_ID | Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Product_Category | Orders | Amount | Status | unnamed1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1002903 | Sanskriti | P00125942 | F | 26-35 | 28 | 0 | Maharashtra | Western | Healthcare | Auto | 1 | 23952.0 | NaN | NaN |
1 | 1000732 | Kartik | P00110942 | F | 26-35 | 35 | 1 | Andhra Pradesh | Southern | Govt | Auto | 3 | 23934.0 | NaN | NaN |
2 | 1001990 | Bindu | P00118542 | F | 26-35 | 35 | 1 | Uttar Pradesh | Central | Automobile | Auto | 3 | 23924.0 | NaN | NaN |
3 | 1001425 | Sudevi | P00237842 | M | 0-17 | 16 | 0 | Karnataka | Southern | Construction | Auto | 2 | 23912.0 | NaN | NaN |
4 | 1000588 | Joni | P00057942 | M | 26-35 | 28 | 1 | Gujarat | Western | Food Processing | Auto | 2 | 23877.0 | NaN | NaN |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 11251 non-null int64
1 Cust_name 11251 non-null object
2 Product_ID 11251 non-null object
3 Gender 11251 non-null object
4 Age Group 11251 non-null object
5 Age 11251 non-null int64
6 Marital_Status 11251 non-null int64
7 State 11251 non-null object
8 Zone 11251 non-null object
9 Occupation 11251 non-null object
10 Product_Category 11251 non-null object
11 Orders 11251 non-null int64
12 Amount 11239 non-null float64
13 Status 0 non-null float64
14 unnamed1 0 non-null float64
dtypes: float64(3), int64(4), object(8)
memory usage: 1.3+ MB
#drop unrelated/blank columns
'Status', 'unnamed1'], axis=1, inplace=True) df.drop([
#check for null values
sum() pd.isnull(df).
User_ID 0
Cust_name 0
Product_ID 0
Gender 0
Age Group 0
Age 0
Marital_Status 0
State 0
Zone 0
Occupation 0
Product_Category 0
Orders 0
Amount 12
dtype: int64
# drop null values
=True) df.dropna(inplace
# change data type
'Amount'] = df['Amount'].astype('int') df[
'Amount'].dtypes df[
dtype('int32')
df.columns
Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',
'Marital_Status', 'State', 'Zone', 'Occupation', 'Product_Category',
'Orders', 'Amount'],
dtype='object')
#rename column
= {'Marital_Status':'Shaadi'}) df.rename(columns
User_ID | Cust_name | Product_ID | Gender | Age Group | Age | Shaadi | State | Zone | Occupation | Product_Category | Orders | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1002903 | Sanskriti | P00125942 | F | 26-35 | 28 | 0 | Maharashtra | Western | Healthcare | Auto | 1 | 23952 |
1 | 1000732 | Kartik | P00110942 | F | 26-35 | 35 | 1 | Andhra Pradesh | Southern | Govt | Auto | 3 | 23934 |
2 | 1001990 | Bindu | P00118542 | F | 26-35 | 35 | 1 | Uttar Pradesh | Central | Automobile | Auto | 3 | 23924 |
3 | 1001425 | Sudevi | P00237842 | M | 0-17 | 16 | 0 | Karnataka | Southern | Construction | Auto | 2 | 23912 |
4 | 1000588 | Joni | P00057942 | M | 26-35 | 28 | 1 | Gujarat | Western | Food Processing | Auto | 2 | 23877 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11246 | 1000695 | Manning | P00296942 | M | 18-25 | 19 | 1 | Maharashtra | Western | Chemical | Office | 4 | 370 |
11247 | 1004089 | Reichenbach | P00171342 | M | 26-35 | 33 | 0 | Haryana | Northern | Healthcare | Veterinary | 3 | 367 |
11248 | 1001209 | Oshin | P00201342 | F | 36-45 | 40 | 0 | Madhya Pradesh | Central | Textile | Office | 4 | 213 |
11249 | 1004023 | Noonan | P00059442 | M | 36-45 | 37 | 0 | Karnataka | Southern | Agriculture | Office | 3 | 206 |
11250 | 1002744 | Brumley | P00281742 | F | 18-25 | 19 | 0 | Maharashtra | Western | Healthcare | Office | 3 | 188 |
11239 rows × 13 columns
# describe() method returns description of the data in the DataFrame (i.e. count, mean, std, etc)
df.describe()
User_ID | Age | Marital_Status | Orders | Amount | |
---|---|---|---|---|---|
count | 1.123900e+04 | 11239.000000 | 11239.000000 | 11239.000000 | 11239.000000 |
mean | 1.003004e+06 | 35.410357 | 0.420055 | 2.489634 | 9453.610553 |
std | 1.716039e+03 | 12.753866 | 0.493589 | 1.114967 | 5222.355168 |
min | 1.000001e+06 | 12.000000 | 0.000000 | 1.000000 | 188.000000 |
25% | 1.001492e+06 | 27.000000 | 0.000000 | 2.000000 | 5443.000000 |
50% | 1.003064e+06 | 33.000000 | 0.000000 | 2.000000 | 8109.000000 |
75% | 1.004426e+06 | 43.000000 | 1.000000 | 3.000000 | 12675.000000 |
max | 1.006040e+06 | 92.000000 | 1.000000 | 4.000000 | 23952.000000 |
# use describe() for specific columns
'Age', 'Orders', 'Amount']].describe() df[[
Age | Orders | Amount | |
---|---|---|---|
count | 11239.000000 | 11239.000000 | 11239.000000 |
mean | 35.410357 | 2.489634 | 9453.610553 |
std | 12.753866 | 1.114967 | 5222.355168 |
min | 12.000000 | 1.000000 | 188.000000 |
25% | 27.000000 | 2.000000 | 5443.000000 |
50% | 33.000000 | 2.000000 | 8109.000000 |
75% | 43.000000 | 3.000000 | 12675.000000 |
max | 92.000000 | 4.000000 | 23952.000000 |
Exploratory Data Analysis
Gender
# plotting a bar chart for Gender and it's count
= sns.countplot(x = 'Gender',data = df)
ax
for bars in ax.containers:
ax.bar_label(bars)
# plotting a bar chart for gender vs total amount
= df.groupby(['Gender'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False)
sales_gen
= 'Gender',y= 'Amount' ,data = sales_gen) sns.barplot(x
From above graphs we can see that most of the buyers are females and even the purchasing power of females are greater than men
Age
= sns.countplot(data = df, x = 'Age Group', hue = 'Gender')
ax
for bars in ax.containers:
ax.bar_label(bars)
# Total Amount vs Age Group
= df.groupby(['Age Group'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False)
sales_age
= 'Age Group',y= 'Amount' ,data = sales_age) sns.barplot(x
From above graphs we can see that most of the buyers are of age group between 26-35 yrs female
State
# total number of orders from top 10 states
= df.groupby(['State'], as_index=False)['Orders'].sum().sort_values(by='Orders', ascending=False).head(10)
sales_state
set(rc={'figure.figsize':(15,5)})
sns.= sales_state, x = 'State',y= 'Orders') sns.barplot(data
# total amount/sales from top 10 states
= df.groupby(['State'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False).head(10)
sales_state
set(rc={'figure.figsize':(15,5)})
sns.= sales_state, x = 'State',y= 'Amount') sns.barplot(data
From above graphs we can see that most of the orders & total sales/amount are from Uttar Pradesh, Maharashtra and Karnataka respectively
Marital Status
= sns.countplot(data = df, x = 'Marital_Status')
ax
set(rc={'figure.figsize':(7,5)})
sns.for bars in ax.containers:
ax.bar_label(bars)
= df.groupby(['Marital_Status', 'Gender'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False)
sales_state
set(rc={'figure.figsize':(6,5)})
sns.= sales_state, x = 'Marital_Status',y= 'Amount', hue='Gender') sns.barplot(data
From above graphs we can see that most of the buyers are married (women) and they have high purchasing power
Occupation
set(rc={'figure.figsize':(20,5)})
sns.= sns.countplot(data = df, x = 'Occupation')
ax
for bars in ax.containers:
ax.bar_label(bars)
= df.groupby(['Occupation'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False)
sales_state
set(rc={'figure.figsize':(20,5)})
sns.= sales_state, x = 'Occupation',y= 'Amount') sns.barplot(data
From above graphs we can see that most of the buyers are working in IT, Healthcare and Aviation sector
Product Category
set(rc={'figure.figsize':(20,5)})
sns.= sns.countplot(data = df, x = 'Product_Category')
ax
for bars in ax.containers:
ax.bar_label(bars)
= df.groupby(['Product_Category'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False).head(10)
sales_state
set(rc={'figure.figsize':(20,5)})
sns.= sales_state, x = 'Product_Category',y= 'Amount') sns.barplot(data
From above graphs we can see that most of the sold products are from Food, Clothing and Electronics category
= df.groupby(['Product_ID'], as_index=False)['Orders'].sum().sort_values(by='Orders', ascending=False).head(10)
sales_state
set(rc={'figure.figsize':(20,5)})
sns.= sales_state, x = 'Product_ID',y= 'Orders') sns.barplot(data
# top 10 most sold products (same thing as above)
= plt.subplots(figsize=(12,7))
fig1, ax1 'Product_ID')['Orders'].sum().nlargest(10).sort_values(ascending=False).plot(kind='bar') df.groupby(
Conclusion:
Married women in the 26-35 age range from the states of Uttar Pradesh, Maharashtra, and Karnataka who work in the IT, healthcare, and aviation industries are more likely to purchase items from the food, clothing, and electronics categories.
IT, హెల్త్కేర్ మరియు విమానయాన పరిశ్రమలలో పనిచేసే ఉత్తరప్రదేశ్, మహారాష్ట్ర మరియు కర్ణాటక రాష్ట్రాల నుండి 26-35 సంవత్సరాల వయస్సు గల వివాహిత మహిళలు ఆహారం, దుస్తులు మరియు ఎలక్ట్రానిక్స్ వర్గాల నుండి వస్తువులను కొనుగోలు చేసే అవకాశం ఉంది.
Thank you!